{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "view-in-github"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/LangChain_PDF_Chatbot.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "该Python notebook利用langchain的QA chain，结合Chroma来实现PDF文档Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf的语义化搜索。\n",
        "\n",
        "该PDF文档共61页。通过本notebook，我们演示该字数规模的文件的语义化索引的OpenAI API开销。\n",
        "\n",
        "使用时，在本地创建`.env`，并如`.env.example`所示，设置有效的OpenAI API Key即可。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Nifwi9FrKb3g"
      },
      "outputs": [],
      "source": [
        "%pip install openai > /dev/null\n",
        "%pip install chromadb > /dev/null\n",
        "%pip install langchain > /dev/null"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5xgbUBve0LuN"
      },
      "outputs": [],
      "source": [
        "from langchain.document_loaders import PyMuPDFLoader"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UpOBdhBrdaiU"
      },
      "outputs": [],
      "source": [
        "PDF_NAME='Analysis-and-Comparison-between-Optimism-and-StarkNet.pdf'\n",
        "def load_pdf():\n",
        "  return PyMuPDFLoader(PDF_NAME).load()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_-1itVpTY8Gz"
      },
      "outputs": [],
      "source": [
        "docs = load_pdf()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "33zAThYjY9gA",
        "outputId": "bc6d338b-82e3-461f-be11-04b4f44af8fb"
      },
      "outputs": [],
      "source": [
        "print (f'You have {len(docs)} document(s) in your data')\n",
        "print (f'There are {len(docs[0].page_content)} characters in the first page of your document')\n",
        "\n",
        "total = 0\n",
        "for doc in docs:\n",
        "  total += len(doc.page_content)\n",
        "print (f'There are {total} characters in your document')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ziV20FzmZpm1"
      },
      "outputs": [],
      "source": [
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "\n",
        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
        "split_docs = text_splitter.split_documents(docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WlDqqN_6Z08T",
        "outputId": "c0f4d8f4-2840-41cf-c3fd-dd8da6b9a9fa"
      },
      "outputs": [],
      "source": [
        "print (f'Now you have {len(split_docs)} documents')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6IOlsXpbaIAw"
      },
      "outputs": [],
      "source": [
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import Chroma\n",
        "import os\n",
        "\n",
        "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "bE-KtYTCgkLw"
      },
      "outputs": [],
      "source": [
        "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hGdpt9LygkLy"
      },
      "outputs": [],
      "source": [
        "persist_directory = 'starknet'\n",
        "collection_name = 'starknet_index'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain.callbacks import get_openai_callback"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1mQJJ8HRaf0I",
        "outputId": "d9343337-2df2-49dd-84d3-ac354ee65b7c"
      },
      "outputs": [],
      "source": [
        "with get_openai_callback() as cb:\n",
        "    vectorstore = Chroma.from_documents(split_docs, embeddings, collection_name=collection_name, persist_directory=persist_directory)\n",
        "    vectorstore.persist()\n",
        "    print(cb)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yBAx1_X-beQp"
      },
      "outputs": [],
      "source": [
        "from langchain.llms import OpenAI\n",
        "from langchain.chains.question_answering import load_qa_chain"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "V8hds-zybhfc"
      },
      "outputs": [],
      "source": [
        "llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n",
        "\n",
        "chain = load_qa_chain(llm, chain_type=\"stuff\")\n",
        "\n",
        "# Load the vectorstore from disk\n",
        "vectordb = Chroma(collection_name=collection_name, persist_directory=persist_directory, embedding_function=embeddings)\n",
        "\n",
        "query = \"What is starknet?\"\n",
        "docs = vectorstore.similarity_search(query, 3, include_metadata=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "print(chain.document_prompt)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K3SlreQ2haC4",
        "outputId": "417ebd17-6bd3-431f-8699-3b9e81cd8911"
      },
      "outputs": [],
      "source": [
        "for doc in docs:\n",
        "    print(doc.metadata)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "print(chain.prompt_length(docs, question='What is starknet?'))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zdF03LqphOQY",
        "outputId": "bc596972-7e39-479a-a921-5b17204f4b7f"
      },
      "outputs": [],
      "source": [
        "with get_openai_callback() as cb:\n",
        "    print(chain.run(input_documents=docs, question=query))\n",
        "    print(cb)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "include_colab_link": true,
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.16"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
